In this markdown, we will visualize the centroids of a given cluster, and see if this is an appropriate way to look at how the clusters shift around. Let’s get started.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/tylerburns/Desktop/projects/tech_projects/active/cluster_stability
library(flowCore)
library(sleepwalk)
set.seed(1)
setwd(here::here('data'))
# Read in the cells
cells <- flowCore::read.FCS(list.files(pattern = "SLE")) # Data from Marie Burns
params <- as.vector(Biobase::pData(parameters(cells))$desc)
colnames(cells) <- params
cells <- exprs(cells)
cells <- cells[,!is.na(colnames(cells))]
cells <- as_tibble(cells)
# Filter the cells by marker we're going to use
marker_info <- readr::read_csv("cytof_marker_data.csv")
## Rows: 62 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, desc, marker_type, notes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
marker_info <- dplyr::filter(marker_info, is.na(notes)) # Take this out later
# Tranform the markers we want to transform
to_transform <- dplyr::filter(marker_info, marker_type != "none")$desc
keep_same <- dplyr::filter(marker_info, marker_type == "none")$desc %>% .[!is.na(.)]
tmp1 <- cells[,to_transform]
tmp2 <- cells[,keep_same]
tmp1 <- asinh(tmp1/5)
cells <- bind_cols(tmp1, tmp2)
surface <- cells[,dplyr::filter(marker_info, marker_type == "type")$desc] %>%
as_tibble()
surface
## # A tibble: 240,428 × 28
## `209Bi_CD11b` `140Ce_CD14` `161Dy_CD7` `162Dy_IgM` `163Dy_CCR7` `167Er_CD38`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.27 0.681 0.535 1.82 0.768 2.93
## 2 3.04 0.0814 0 0 0.566 2.75
## 3 1.36 0.950 3.19 1.03 0.853 3.74
## 4 3.22 1.31 0.679 0.965 0.551 3.53
## 5 3.20 0.144 0.607 2.16 0.0316 3.89
## 6 0 0.648 3.56 2.01 1.91 0.575
## 7 0 0.418 3.13 1.67 0.388 0.517
## 8 0 0 4.65 3.11 3.15 1.04
## 9 3.20 0.705 0.0979 1.73 0.401 2.17
## 10 0.0155 0.249 4.34 2.03 3.43 4.64
## # ℹ 240,418 more rows
## # ℹ 22 more variables: `168Er_CD16` <dbl>, `151Eu_CD123` <dbl>,
## # `155Gd_CD27` <dbl>, `160Gd_CD11c` <dbl>, `113In_CD66b` <dbl>,
## # `175Lu_CXCR3` <dbl>, `143Nd_CD19` <dbl>, `145Nd_CD4` <dbl>,
## # `146Nd_CD45RO` <dbl>, `148Nd_IgA` <dbl>, `141Pr_CD127` <dbl>,
## # `195Pt_CD3` <dbl>, `196Pt_CD8` <dbl>, `198Pt_CD45` <dbl>,
## # `147Sm_CD20` <dbl>, `152Sm_CD45RA` <dbl>, `154Sm_CD1c` <dbl>, …
NumberDuplicates <- function(x) {
# Count occurrences and make adjustments if necessary
ux <- unique(x)
for (i in ux) {
# Indices of each unique string
indices <- which(x == i)
if (length(indices) > 1) {
# Modify elements at those indices
x[indices] <- paste0(i, seq_along(indices))
}
}
return(x)
}
# Make the naming easier
names(surface) <- sub(".*_", "", names(surface)) %>% NumberDuplicates()
names(surface)
## [1] "CD11b" "CD14" "CD7" "IgM" "CCR7" "CD38" "CD16" "CD123"
## [9] "CD27" "CD11c" "CD66b" "CXCR3" "CD19" "CD4" "CD45RO" "IgA"
## [17] "CD127" "CD3" "CD8" "CD45" "CD20" "CD45RA" "CD1c" "CD25"
## [25] "CD15" "IgD" "HLA-DR" "CD56"
We gate and sample.
# Mononuclear gate
surface <- dplyr::filter(surface, CD45 > 3 & CD66b < 2)
# Subsample
num_cells <- 10000
surface <- surface[sample(nrow(surface), num_cells),]
surface
## # A tibble: 10,000 × 28
## CD11b CD14 CD7 IgM CCR7 CD38 CD16 CD123 CD27 CD11c CD66b
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00803 0.744 1.95 0.651 1.74 0.391 1.34 0.0282 4.22 0 0.0810
## 2 3.15 0.0396 0.567 0 0 1.41 3.89 0 0 0.654 1.80
## 3 0.205 0.595 2.30 0 1.63 3.74 0 0 2.49 0 0
## 4 0.183 1.09 1.23 0 2.24 0.890 0 0 3.22 0.506 1.09
## 5 0.241 0.379 3.01 0.668 2.26 3.91 0 0 4.26 0 0
## 6 2.73 0.258 0 0 0 1.08 1.99 0 0 0.804 1.94
## 7 0.523 0.280 5.28 1.13 0 4.74 0.484 0 2.26 0.134 0.364
## 8 0.0694 0.636 3.45 0.736 1.54 0 0.762 0.525 4.33 0 0.775
## 9 0.631 0 3.09 0.715 0.353 1.64 0 0.0789 0 0.0264 0
## 10 0.0138 0.637 2.79 0.931 1.29 3.65 0 0 3.76 0.0233 0
## # ℹ 9,990 more rows
## # ℹ 17 more variables: CXCR3 <dbl>, CD19 <dbl>, CD4 <dbl>, CD45RO <dbl>,
## # IgA <dbl>, CD127 <dbl>, CD3 <dbl>, CD8 <dbl>, CD45 <dbl>, CD20 <dbl>,
## # CD45RA <dbl>, CD1c <dbl>, CD25 <dbl>, CD15 <dbl>, IgD <dbl>,
## # `HLA-DR` <dbl>, CD56 <dbl>
And now we cluster.
library(FlowSOM)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:flowCore':
##
## normalize
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Thanks for using FlowSOM. From version 2.1.4 on, the scale
## parameter in the FlowSOM function defaults to FALSE
input <- FlowSOM::ReadInput(as.matrix(surface))
som <- FlowSOM::BuildSOM(input)
## Building SOM
## Mapping data to SOM
mc <- FlowSOM::metaClustering_consensus(som$map$codes, k = 30)
mc_cells <- FlowSOM::GetMetaclusters(som, mc)
And now we see how to pull out or compute the cluster centroids. Does FlowSOM give us this out of the box? It looks like we will have to compute them ourselves. How do we do that?
Two options: - Calculate the centroids in the original manifold, make them into rows, run the UMAP on them. - Calculate the centroids directly on the UMAP.
Both ways use the following method: - Calculate the mean value of each coordinate.
Given that this is a visualization exercise, we are going to calculate them directly on the UMAP. First, we need to make the UMAP.
library(umap)
umap <- umap::umap(surface, preserve.seed = FALSE)$layout %>% as_tibble()
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
names(umap) <- c("umap1", "umap2")
Great. Now let’s go ahead and plot what we’ve got.
toplot <- bind_cols(umap, cluster = mc_cells)
ggplot(toplot, aes(x = umap1, y = umap2, color = as.factor(cluster))) + geom_point()
Looks fine. Now it’s centroid compute time.
clust <- unique(mc)
clust_cent <- lapply(clust, function(i) {
result <- dplyr::filter(toplot, mc_cells == i) %>% apply(., 2, median)
return(result)
}) %>% do.call(rbind, .) %>% as_tibble()
And now we plot it.
ggplot(clust_cent, aes(x = umap1, y = umap2)) + geom_point()
Ok, now the big challenge: is there somehow a way I can superimpose one plot on top of another. I want the cluster centroids to be real big with a backdrop of a UMAP in the background. This will be the gif. It looks like we can superimpose the plots directly in the command. Let’s try that.
ggplot() +
geom_point(data = toplot, aes(x = umap1, y = umap2), color = "black", alpha = 0.9) +
geom_point(data = clust_cent, aes(x = umap1, y = umap2), color = "yellow", size = 2)
Ok, this will do! Now its time to run this a bunch of times in a loop, make the gif, and order by similarity.
But first, just as a simple sanity check we color by Cluster ID.
ggplot() +
geom_point(data = toplot, aes(x = umap1, y = umap2), color = as.factor(toplot$cluster)) +
geom_point(data = clust_cent, aes(x = umap1, y = umap2), color = "yellow", size = 2)
Let’s find that “offshore” cluster by visualizing each cluster one by one.
for(i in seq(length(clust_cent$cluster))) {
tmp <- dplyr::filter(toplot, cluster == i)
tmp_cent <- dplyr::filter(clust_cent, clust == i)
p <- ggplot() +
geom_point(data = tmp, aes(x = umap1, y = umap2), color = "black", alpha = 0.9) +
geom_point(data = tmp_cent, aes(x = umap1, y = umap2), color = "yellow", size = 2)
print(p)
}